version 15

*** set the folder with the data here

* cd "C:\myfolder"


*** build the two lookup files that are merged onto the MEP data further below
* ('merge' needs identically named key variables in master and using, so every
*  ID is copied once under the lag key name and once under the lead key name)

* aggregate file, keyed on IDAD
import delimited using "COMEPELDA_aggregate_v1.00.csv", varnames(1) encoding("UTF-8") case(preserve) clear
keep IDAD pName pAlliance01 pdNofCandSel pdNofCandEl pdSeats
foreach when in Lag Lead {
	generate IDADfoc`when' = IDAD
}
save "tmpaggregate.dta", replace

* candidates file, keyed on IDCD
import delimited using "COMEPELDA_candidates_v1.00.csv", varnames(1) encoding("UTF-8") case(preserve) clear
keep IDCD ListRankSel OrderElec
foreach when in Lag Lead {
	generate IDCDfoc`when' = IDCD
}
save "tmpcandidates.dta", replace

*** 

* Load the MEP file; it becomes the master data for everything below
import delim COMEPELDA_meps_v1.00.csv, clear varnam(1) encoding("UTF-8") case(preserve)

* Aim is selecting first-term MEPs who re-run
drop if EP == 8 // can't look at this because data do not cover 2019 election yet
* drop those who were (are known to have been, from the data) members in previous EP:
*	(there might be cases of membership only in earlier EPs other than the previous one, but we don't know that for all terms)
drop if EP == 5 & InEP4 == 1
drop if EP == 6 & (InEP4 == 1 | InEP5 == 1)
drop if EP == 7 & (InEP4 == 1 | InEP5 == 1 | InEP6 == 1)
drop if EpisType == "app EU enl" | EpisType == "observ" // there is no previous EP election for these
* show which episode types the non-runners belong to before removing them
tab EpisType if Run01Lag == 0
drop if Run01Lag == 0 // a few MEPs did not run in previous election
drop if Run01Lead == 0 // keep those who re-ran


* check for duplicate entries for the same person in the same EP
* dup is 0 for unique EP-IDmep pairs and 1, 2, ... within pairs that occur more than once.
* A plain 'bysort EP IDmep:' orders tied observations arbitrarily, so which entry
* gets dup == 1 would differ between runs. The stable sort keeps the order of the
* imported file within each EP-IDmep pair, which makes the drop reproducible.
* NOTE(review): this assumes the earlier spell precedes the later one in the CSV - confirm.
sort EP IDmep, stable
by EP IDmep : gen dup = cond(_N==1,0,_n)
drop if dup == 1 // drop the first spell of Daniel Van der Stoep who was initially elected, resigned and then returned after Lisbon appointment

* quick look at the remaining sample size and at the ID variables
di _N
sum ID*

*** attach the lookup files, once via the lag key and once via the lead key
* Every merge keeps matched observations only and stops with an error if an
* observation of the data in memory finds no match in the using file.
* The variables coming from the using file are prefixed with lag_ / lead_
* right after each merge so that the second pass does not collide with the first.

local aggvars  pName pAlliance01 pdNofCandSel pdNofCandEl pdSeats
local candvars ListRankSel OrderElec

* variables from the aggregate file
foreach when in Lag Lead {
	local pfx = strlower("`when'")
	merge m:1 IDADfoc`when' using "tmpaggregate.dta", keep(match) assert(using match) nogenerate
	rename (`aggvars') `pfx'_=
}

* variables from the candidates file
foreach when in Lag Lead {
	local pfx = strlower("`when'")
	merge m:1 IDCDfoc`when' using "tmpcandidates.dta", keep(match) assert(using match) nogenerate
	rename (`candvars') `pfx'_=
}

* external data with parliamentary activities and further information on the MEPs
merge 1:1 IDmep EP using "reselection_externaldata.dta", keep(match) assert(using match) nogenerate

* exclude independent candidates (there is no list for them):
* either party ID below -9999 removes the observation
drop if min(IDptyLag, IDptyLead) < -9999

* keep only cases whose alliance status is the same in both elections
keep if lag_pAlliance01 == lead_pAlliance01

* keep only cases whose list type (main vs successor) is the same in both elections
keep if SuccListLag == SuccListLead

* keep only cases with the same party (ID) in both elections
keep if IDptyLag == IDptyLead

* keep only cases with the same district in both elections
keep if dNameLag == dNameLead

* sanity checks on the party names: lead name must be present;
* report empty lag names and lag/lead name mismatches
assert lead_pName != ""
count if lag_pName == ""
count if lag_pName != lead_pName

* intra-party competition: log of candidates per seat at the lag election
* (the denominator is set to 1 where no seats were won previously)
generate lag_logipcomp = log(lag_pdNofCandSel / cond(lag_pdSeats == 0, 1, lag_pdSeats))

* dummy for replacements: 1 for every episode type other than "init elec"
tabulate EpisT, missing
generate replacem = (EpisT != "init elec")

* the lag order of election must be unique within party x election year x district
bysort IDptyLag ElYearLag dNameLag lag_OrderElec: generate tmp = _N if lag_OrderElec != .
assert inlist(tmp, 1, .)
* cases without a lag order of election, split by replacement status
tabulate replacem if tmp == .
drop tmp
* a non-missing order of election may not exceed the number of seats
assert lag_Order <= lag_pdSeats if lag_Order < .


* first-term MEPs only (observations with missing terms_served2 are removed as well)
keep if terms_served2 <= 1

* order of election at the lag election:
* indicator for being first, and order relative to the number of seats;
* replacements are coded as not first and as relative order 1
generate lag_Ord_First = (lag_Order == 1) if !missing(lag_Order)
replace lag_Ord_First = 0 if replacem == 1
generate lag_Ord_Relative = cond(replacem == 1, 1, lag_Order/lag_pdSeats)

summarize lag_Ord*

* list rank at the lag election:
* indicator for rank 1 (no override for replacements here, unlike lag_Ord_First)
generate lag_ListRank_First = (lag_ListRankSel == 1) if !missing(lag_ListRankSel)

* rank relative to the number of candidates; replacements are set to 1
generate lag_ListRank_Relative = cond(replacem == 1, 1, lag_ListRankSel/lag_pdNofCandSel)
summarize lag_ListRank*

* dependent variables
* raw change in list rank from the lag to the lead election
generate Demotion_raw = lead_ListRankSel - lag_ListRankSel
* 1 if the rank number increased, 0 if it stayed the same or decreased
generate Demotion_dummy = (Demotion_raw > 0) if !missing(Demotion_raw)
* three-way move: -1 is down/worse, 0 is unchanged and 1 is up/better
generate move3_ListRank = sign(-Demotion_raw)

* with the exception of Italy (some missing values), the missing cases are structural zeros because there are no (ordered) lists in these countries
tabulate cName if Demotion_raw == .
keep if Demotion_raw != .


* duration relative to the longest duration observed in the same EP
bysort EP: egen tmp_maxdur = max(Duration2)
generate reldur2 = Duration2/tmp_maxdur
drop tmp_maxdur

* drop MEPs with short mandates (less than a quarter of the longest one in their EP)
* (normalization would be questionable if spells too short)
drop if reldur2 < .25

* recode activity vars into terciles (1 = lowest, 3 = highest)
foreach v of varlist speeches Nb_PQs2 rapports2 {
	* normalize by relative duration
	generate tmp = `v'/reldur2

	* tercile cut-offs over the whole remaining sample
	egen tmp_lo = pctile(tmp), p(33.3)
	egen tmp_hi = pctile(tmp), p(66.6)

	* one step up for every cut-off that is exceeded; missing activity stays missing
	generate ter_`v' = 1 + (tmp > tmp_lo) + (tmp > tmp_hi) if tmp < .

	drop tmp*
}

* (the lower tercile for rapports is between 0 and 1)
tab1 ter_*


* list of control variables, stored in a global macro
* All variables are written out in full. The earlier list contained "replace",
* "lag_ListRank_Rel" and "lag_Ord_Rel", which only resolve to replacem,
* lag_ListRank_Relative and lag_Ord_Relative through variable abbreviation
* (this breaks under 'set varabbrev off' or once another variable shares the prefix).
global ctrls "leadership2 age female2 i.ElYearLead replacem lag_logipcomp  lag_ListRank_First lag_ListRank_Relative lag_Ord_First lag_Ord_Relative"

* descriptive overview of the outcomes, the activity terciles and the controls
sum Demotion_dummy move3_ListRank ter_* $ctrls

* store the analysis data and remove the temporary lookup files created at the top
save "reselection_dataforanalysis.dta", replace
erase "tmpaggregate.dta"
erase "tmpcandidates.dta"

